In [2]:
import pandas as pd
import dash
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import plotly.graph_objects as go
from urllib.request import urlopen
import json

Choropleth Map of Known Cases

First, obtain JSON data for choropleth map.

In [109]:
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

Because the map maps FIPS data to identify counties, the datasets with Georgia counties also need to have the FIPS codes. FIPS were obtained and appended to the dataset in 'get-fips.ipynb'.

In [110]:
county_df = pd.read_csv('countyFIPS0423.csv', dtype={"FIPS": str})
county_df.head()
Out[110]:
Name FIPS Cases Deaths Percent Deaths
0 Appling 13001 49.0 3.0 6.122449
1 Atkinson 13003 7.0 0.0 0.000000
2 Bacon 13005 20.0 1.0 5.000000
3 Baker 13007 19.0 2.0 10.526316
4 Baldwin 13009 161.0 4.0 2.484472

The choropleth map shows which areas have the most numbers of cases relative to each other. Hover for exact numbers.

In [5]:
fig1 = px.choropleth(county_df, geojson=counties, locations='FIPS', color='Cases',
                           color_continuous_scale="Redor",
                           range_color=(0, county_df["Cases"].max()),
                           scope="usa",
                           hover_name = "Name",
                           hover_data = ["Cases"],
                           labels={'Cases':'Cases'}
                          )
fig1.update_geos(fitbounds="locations", visible=False)
fig1.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig1.show()

Distribution of Tests by Laboratories

In [46]:
tests_df = pd.read_csv('testing0423.csv')
#Set first column as index for easier data extraction
#tests_df.set_index(tests_df.columns[0], inplace=True)
tests_df
Out[46]:
Entity Positive Tests Total Tests Percent Positive
0 Commercial Lab 20634 94525 21.829146
1 Gphl 1249 6537 19.106624

Show tests administered by commercial and non-commercial labs as a propotion of the total. As we can see, commercial labs have administered significantly more tests than the GPHL.

In [47]:
pie_total_tests = px.pie(tests_df,
                         title="Tests Administered by Entity",
                         values='Total Tests', 
                         names=['Commercial Laboratories', 'Georgia Public Heath Laboratory'], 
                         #names = 'Entity',
                         hover_name = ['Commercial Laboratories', 'Georgia Public Heath Laboratory'],
                         hover_data = ['Total Tests'],
                        )

pie_total_tests.show()

Show the proportion of tests that were positive and negative.

In [257]:
bar_testing = go.Figure(data=[
    go.Bar(name='Negative Tests', x=tests_df['Entity'], y=tests_df['Total Tests']-tests_df['Positive Tests'], marker_color='mediumseagreen'),
    go.Bar(name='Positive Tests', x=tests_df['Entity'], y=tests_df['Positive Tests'], marker_color='firebrick')
])
# Change the bar mode
bar_testing.update_layout(barmode='stack')

Georgia began releasing information about the races of people affected by COVID-19. While most are unknown or unrecorded, we can get insights about people of known races.

In [117]:
race_df = pd.read_csv('race0423.csv')
race_df.head()
Out[117]:
Race Ethnicity Cases Deaths Percent Deaths
0 Black Or African American Hispanic/Latino 115 4 3.478261
1 Black Or African American Non-Hispanic/Latino 5577 460 8.248162
2 Black Or African American Unknown 1388 10 0.720461
3 White Hispanic/Latino 623 16 2.568218
4 White Non-Hispanic/Latino 3918 333 8.499234

Show the number of cases per race as a proportion of the total. Note that you can click races on the key to add/remove them. Of those whose races are known, we can see that Black or African-American people have been most affected.

In [10]:
pie_total_race = px.pie(race_df,
                         title="Total Cases By Race",
                         values='Cases', 
                         names='Race'
                        )
pie_total_race.update_traces(textposition='outside', textinfo='percent+label')
pie_total_race.show()

The dataset also includes information about ethnicity (whether Hispanic, non-Hispanic, or unknown). This stacked bargraph will show the total cases by both race and ethnicity. People of unknown race have been excluded.

In [122]:
known_race = race_df.loc[race_df['Race'] != "Unknown"]


fig = px.bar(known_race,
             title = "Total Cases by Race and Ethnicity (When Known)",
             x="Race",
             y="Cases",
            color = "Ethnicity")
fig.show()

Deaths Data

This histogram shows the distribution of the ages at death form COVID-19 by gender. A box and whisker plot is maginally added for further information. As we can see, males have been dying younger than females.

In [261]:
deaths_df = pd.read_csv('deaths0423.csv')
In [260]:
fig = px.histogram(deaths_df, title="Georgia COVID-19 Deaths by Age", 
                   x="Age", 
                   color="Gender", 
                   marginal="box", #box-and-whiskers distribution
                   color_discrete_map = {'MALE': 'navy', 'FEMALE': ' MediumVioletRed', 'UNKNOWN':'black'})
fig.show()
In [238]:
#TODO: Add statistical tests to see if there differences are statistically significant

The vast majority of people who have died had known underlying conditions.

In [107]:
deaths_df['Underlying'].value_counts()

underlying = px.pie(deaths_df,
                    title="Presence of Underlying Conditions Among the Deceased",
                    values=deaths_df['Underlying'].value_counts(), 
                    names=['Underlying condition present', 'Unknown condition', 'Underlying condition not present'],
                    #hover_name = ['Underlying condition present', 'Unknown condition', 'Underlying condition not present'],
                    #hover_data = ['Underlying'].value_counts()
                        )
underlying.update_traces(textposition='outside', textinfo='percent+label')

underlying.show()

Verify that the labels were labeled correctly.

In [90]:
deaths_df['Underlying'].value_counts()
Out[90]:
Yes    575
Unk    281
No      25
Name: Underlying, dtype: int64

Which counties have had the most deaths as a proportion of the total number of cases? I previously added a "Percent Deaths" column to the deaths data set. Here I will make another choropleth map to show this data. Hover data also includes the total number of cases and deaths for better perspective.

In [259]:
deaths_map = px.choropleth(county_df, geojson=counties, locations='FIPS', color='Percent Deaths',
                           color_continuous_scale="YlOrRd",
                           range_color=(0, county_df["Percent Deaths"].max()),
                           scope="usa",
                           hover_name = "Name",
                           hover_data = ["Cases","Deaths","Percent Deaths"]
                           #labels={'Cases':'Cases'}
                          )
deaths_map.update_geos(fitbounds="locations", visible=False)
deaths_map.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
deaths_map.show()

Number of Cases Over Select Dates

In [124]:
import glob

Function to merge datasets when necessary

In [207]:
#Function to append dates to dataframes then merge the data sets of that type together
#This is to make it easier to track changes over time
#Input MUST be a string in the format 'dataset*.csv' (example: 'summary*.csv') 
# OR a valid date can replace the * if only one file is needed.

def track_dates(file_names):
    
    #empty DataFrame by which dataframes will be stacked by date
    merged_df = pd.DataFrame()
    
    for file in glob.glob(file_names):
        #take date from file names
        date = file[-8:-4]
        
        #append new Date column to that data
        df = pd.read_csv(file)
        df = df.assign(Date = date)
        
        merged_df = merged_df.append(df)
        
    return merged_df
In [208]:
tracked_cases = track_dates('summary*.csv')
In [209]:
#Verify that the files have merged with dates
tracked_cases
Out[209]:
Status Confirmed Cases Percent of Total Date
0 Total 21883 100.00 0423
1 Hospitalized 4154 18.98 0423
2 Deaths 881 4.03 0423
0 Total 22491 100.00 0424
1 Hospitalized 4322 19.22 0424
2 Deaths 899 4.00 0424
0 Total 23216 100.00 0425
1 Hospitalized 4353 18.75 0425
2 Deaths 907 3.91 0425
0 Total 23481 100.00 0426
1 Hospitalized 4377 18.64 0426
2 Deaths 916 3.90 0426
0 Total 24579 100.00 0427
1 Hospitalized 4611 18.76 0427
2 Deaths 1022 4.16 0427
0 Total 24854 100.00 0428
1 Hospitalized 4738 19.06 0428
2 Deaths 1036 4.17 0428
In [237]:
tracked_total = tracked_cases.loc[tracked_cases["Status"]=="Total"]
tracked_deaths = tracked_cases.loc[tracked_cases["Status"]=="Deaths"]
tracked_hospitalized = tracked_cases.loc[tracked_cases["Status"]=="Hospitalized"]

fig = go.Figure()

fig.add_trace(go.Scatter(x=tracked_total['Date'], y=tracked_total['Confirmed Cases'],
                    mode='lines+markers',
                    name='Total Cases',
                    text = ["April 23","April 24","April 25","April 26","April 27","April 28"]))

fig.add_trace(go.Scatter(x=tracked_hospitalized['Date'], y=tracked_hospitalized['Confirmed Cases'],
                    mode='lines+markers',
                    name='Total Hospitalized',
                    text = ["April 23","April 24","April 25","April 26","April 27","April 28"]))

fig.add_trace(go.Scatter(x=tracked_deaths['Date'], y=tracked_deaths['Confirmed Cases'],
                    mode='lines+markers',
                    name='Total Deaths',
                    text = ["April 23","April 24","April 25","April 26","April 27","April 28"]))

fig.show()